# ██████╗ ██╗ ██╗██╗ ██╗██╗ ██████╗ ██████╗ ██╗ ██╗███████╗██████╗ ███████╗
# ██╔══██╗██║ ██║╚██╗ ██╔╝██║ ██╔═══██╗██╔══██╗██║ ██║██╔════╝██╔══██╗██╔════╝
# ██████╔╝███████║ ╚████╔╝ ██║ ██║ ██║██████╔╝███████║█████╗ ██████╔╝█████╗
# ██╔═══╝ ██╔══██║ ╚██╔╝ ██║ ██║ ██║██╔═══╝ ██╔══██║██╔══╝ ██╔══██╗██╔══╝
# ██║ ██║ ██║ ██║ ███████╗╚██████╔╝██║ ██║ ██║███████╗██║ ██║███████╗
# ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═╝╚══════╝
## PHYLOPHERE: A Nextflow pipeline including a complete set
## of phylogenetic comparative tools and analyses for Phenome-Genome studies
### Github: https://github.com/nozerorma/caastools/nf-phylophere
### Author: Miguel Ramon (miguel.ramon@upf.edu)
#### File: CI-composition.Rmd
This script computes 95% credibility intervals (CIs) for the trait of interest:
This section configures the working environment, sets directories, and loads necessary functions and libraries.
# Call the setup function from commons.R
source("./obj/commons.R")
## [DEBUG] args = cancer_traits_processed-LQ.csv | science.abn7829_data_s4.nex.tree | data_exploration | 1998 | primates | family | neoplasia_prevalence | adult_necropsy_count | neoplasia_necropsy | /home/miguel/IBE-UPF/PhD/PhyloPhere/Data/5.Phylogeny/taxid_species_family.tsv | malignant_prevalence | LQ
## [DEBUG] seed_val = 1998
## [DEBUG] workingDir = /home/miguel/IBE-UPF/PhD/PhyloPhere/work/5e/1d08697bd537e1c303875efe635be0
## [DEBUG] objDir = /home/miguel/IBE-UPF/PhD/PhyloPhere/work/5e/1d08697bd537e1c303875efe635be0/obj
## [DEBUG] resultsDir = data_exploration
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] "Working Directory: /home/miguel/IBE-UPF/PhD/PhyloPhere/work/5e/1d08697bd537e1c303875efe635be0"
## [1] "Results Directory: data_exploration"
## [DEBUG] Working Directory: /home/miguel/IBE-UPF/PhD/PhyloPhere/work/5e/1d08697bd537e1c303875efe635be0
## [DEBUG] Results Directory: data_exploration
## [DEBUG] data_exploration_dir = data_exploration/1.Data-exploration
## [DEBUG] species_distribution_dir = data_exploration/1.Data-exploration/1.Species_distribution
## [DEBUG] extreme_plots_dir = data_exploration/1.Data-exploration/2.Extreme_plots
## [DEBUG] phylo_distribution_dir = data_exploration/1.Data-exploration/3.Phylogenetic_distribution
## [DEBUG] ci_dir = data_exploration/1.Data-exploration/4.CI_overlaps
## [DEBUG] palettes loaded: primates=15, primates_dark=15, mammals=17
## [1] "Loading trait data from: cancer_traits_processed-LQ.csv"
## [DEBUG] trait_path = cancer_traits_processed-LQ.csv
## [DEBUG] trait_df rows = 47, cols = 19
## [DEBUG] trait_df columns: tax_id, species, common_name, family, adult_necropsy_count, neoplasia_necropsy, neoplasia_prevalence, benign_count, benign_prevalence, malignant_count, malignant_prevalence, body_mass, mass_g, log_body_mass, mature_f, mature_m, mature_age, MLS.anage, LQ
## [DEBUG] trait_df species unique = 47
## [DEBUG] tree_path = science.abn7829_data_s4.nex.tree
## [DEBUG] tree tips = 236, nodes = 235
## [DEBUG] clade_name = primates
## [DEBUG] taxon_of_interest = family
## [DEBUG] trait = neoplasia_prevalence
## [DEBUG] n_trait = neoplasia_necropsy
## [DEBUG] p_trait = adult_necropsy_count
## [DEBUG] has.p = TRUE, p missing = 0
## [DEBUG] has.n = TRUE, n missing = 0
## Warning: package 'ape' was built under R version 4.4.2
##
## Attaching package: 'ape'
## The following object is masked from 'package:dplyr':
##
## where
## [DEBUG] tax_id_file = /home/miguel/IBE-UPF/PhD/PhyloPhere/Data/5.Phylogeny/taxid_species_family.tsv
## [DEBUG] tax_id_df rows = 1290, distinct taxa = 807
## [DEBUG] has.TAX_ID = TRUE
## [DEBUG] trait_df merged with tax_id_df: rows = 47, missing tax_id = 0
## [DEBUG] normalized tax_id from merged columns, missing tax_id = 0
## [DEBUG] tree_ids rows = 40, missing tax_id = 0
## [DEBUG] common_tax_ids = 40
## [DEBUG] pruned_tree tips (TAX_ID) = 40, nodes = 39
## [DEBUG] trait_df after TAX_ID tree filter rows = 40
## [DEBUG] secondary_trait = malignant_prevalence
## [DEBUG] has.secondary = TRUE, missing = 0
## [DEBUG] branch_trait = LQ
## [DEBUG] has.branch = TRUE, missing = 0
setup_rmd()
# Debug helper (prints into HTML output)
if (is.null(getOption("phylo_debug"))) {
options(phylo_debug = TRUE)
}
if (!exists("phylo_debug_log", envir = .GlobalEnv)) {
phylo_debug_log <- character()
}
if (!exists("debug_log", inherits = TRUE)) {
debug_log <- function(...) {
msg <- sprintf(...)
phylo_debug_log <<- c(phylo_debug_log, msg)
cat("[DEBUG] ", msg, "\n", sep = "")
}
}
# Load additional libraries
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(patchwork)
library(ggtext)
color_palette <- paste0(clade_name, "_palette") # Color palette for the clade
capitalized_taxon <- tools::toTitleCase(taxon_of_interest) # Capitalized taxon name
capitalized_trait <- tools::toTitleCase(gsub("_", " ", trait)) # Capitalized and deconvoluted trait name
capitalized_clade <- tools::toTitleCase(gsub("_", " ", clade_name)) # Capitalized and deconvoluted clade name
createDir(ci_dir)
## [DEBUG] createDir: created data_exploration/1.Data-exploration/4.CI_overlaps
This section calculates 95% confidence intervals for malignant and neoplasia prevalence per species using both Wilson and Jeffreys methods.
# Compute CIs directly using DescTools and binom packages
trait_ci <- trait_df %>%
group_by(species) %>%
dplyr::rename(n_population = .data[[p_trait]],
n_observation = .data[[n_trait]],
trait = .data[[trait]]) %>%
dplyr::mutate(
# Jeffreys method CIs
trait_ci_lb = binom::binom.confint(n_observation, n_population,
method = "bayes", priors = c(0.5, 0.5),
conf.level = 0.95)[, 5],
trait_ci_ub = binom::binom.confint(n_observation, n_population,
method = "bayes", priors = c(0.5, 0.5),
conf.level = 0.95)[, 6],
trait_ci = paste0("[", round(trait_ci_lb, 2), "-", round(trait_ci_ub, 2), "]")
) %>%
ungroup()
## Warning: Use of .data in tidyselect expressions was deprecated in tidyselect 1.2.0.
## ℹ Please use `all_of(var)` (or `any_of(var)`) instead of `.data[[var]]`
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Save results
write.csv(trait_ci, file.path(ci_dir, "trait_CI.csv"),
quote = FALSE, row.names = FALSE)
# Display preview
head(trait_ci)
## # A tibble: 6 × 22
## species common_name family n_population n_observation trait benign_count
## <chr> <chr> <chr> <int> <int> <dbl> <int>
## 1 Alouatta_ca… Black howl… Ateli… 32 4 0.125 3
## 2 Ateles_geof… Black-hand… Ateli… 43 13 0.302 5
## 3 Callimico_g… Goeldi's m… Cebid… 69 15 0.217 8
## 4 Callithrix_… White-fron… Cebid… 31 3 0.0968 2
## 5 Callithrix_… Common mar… Cebid… 41 1 0.0244 0
## 6 Cebuella_py… Pygmy marm… Cebid… 41 4 0.0976 2
## # ℹ 15 more variables: benign_prevalence <dbl>, malignant_count <int>,
## # malignant_prevalence <dbl>, body_mass <dbl>, mass_g <dbl>,
## # log_body_mass <dbl>, mature_f <dbl>, mature_m <dbl>, mature_age <dbl>,
## # MLS.anage <dbl>, LQ <dbl>, tax_id <int>, trait_ci_lb <dbl>,
## # trait_ci_ub <dbl>, trait_ci <chr>
This section prepares data for comparing cancer prevalence between all pairs of species.
# Helper function to create pairwise comparisons for a given method
create_pairwise_data <- function(ci_data) {
# Define column names for lower and upper bounds
ci_subset <- ci_data %>%
dplyr::select(species, n_population,
n_observation, trait,
trait_ci_lb, trait_ci_ub)
# Generate all pairwise species combinations
pairwise <- expand.grid(species1 = ci_subset$species,
species2 = ci_subset$species,
stringsAsFactors = FALSE)
# Merge species data for each pair
pairwise <- pairwise %>%
left_join(ci_subset %>% rename_with(~paste0(.x, "1"), .cols = -species),
by = c("species1" = "species")) %>%
left_join(ci_subset %>% rename_with(~paste0(.x, "2"), .cols = -species),
by = c("species2" = "species"))
pairwise <- pairwise %>%
mutate(
trait_diff = trait1 - trait2,
trait_overlap = ci_overlap(
trait_ci_lb1, trait_ci_ub1, trait_ci_lb2, trait_ci_ub2
),
diff_value = trait_diff,
overlap_use = trait_overlap
)
return(pairwise)
}
# Create pairwise data
pairwise_data <- create_pairwise_data(trait_ci)
# Save results
write.csv(pairwise_data, file.path(ci_dir, "pairwise_data.csv"),
quote = FALSE, row.names = FALSE)
# Display preview
head(pairwise_data)
## species1 species2 n_population1 n_observation1 trait1
## 1 Alouatta_caraya Alouatta_caraya 32 4 0.12500000
## 2 Ateles_geoffroyi Alouatta_caraya 43 13 0.30232558
## 3 Callimico_goeldii Alouatta_caraya 69 15 0.21739130
## 4 Callithrix_geoffroyi Alouatta_caraya 31 3 0.09677419
## 5 Callithrix_jacchus Alouatta_caraya 41 1 0.02439024
## 6 Cebuella_pygmaea Alouatta_caraya 41 4 0.09756098
## trait_ci_lb1 trait_ci_ub1 n_population2 n_observation2 trait2 trait_ci_lb2
## 1 3.352372e-02 0.25257350 32 4 0.125 0.03352372
## 2 1.758696e-01 0.44262825 32 4 0.125 0.03352372
## 3 1.281707e-01 0.31916086 32 4 0.125 0.03352372
## 4 1.812308e-02 0.21626214 32 4 0.125 0.03352372
## 5 4.695278e-05 0.09147094 32 4 0.125 0.03352372
## 6 2.547171e-02 0.20034261 32 4 0.125 0.03352372
## trait_ci_ub2 trait_diff trait_overlap diff_value overlap_use
## 1 0.2525735 0.00000000 TRUE 0.00000000 TRUE
## 2 0.2525735 0.17732558 TRUE 0.17732558 TRUE
## 3 0.2525735 0.09239130 TRUE 0.09239130 TRUE
## 4 0.2525735 -0.02822581 TRUE -0.02822581 TRUE
## 5 0.2525735 -0.10060976 TRUE -0.10060976 TRUE
## 6 0.2525735 -0.02743902 TRUE -0.02743902 TRUE
This section transforms pairwise data into matrix format for visualization.
# Helper function to build difference matrix
build_diff_matrix <- function(pairwise_data) {
pairwise_data %>%
select(species1, species2, diff_value) %>%
pivot_wider(names_from = species2, values_from = diff_value) %>%
column_to_rownames("species1") %>%
as.matrix()
}
# Create matrices
matrix <- build_diff_matrix(pairwise_data)
# Save matrices
write.csv(matrix, file.path(ci_dir, "diff_matrix-CI.csv"),
quote = FALSE, row.names = TRUE)
# Display preview
head(matrix)
## Alouatta_caraya Ateles_geoffroyi Callimico_goeldii
## Alouatta_caraya 0.00000000 -0.17732558 -0.09239130
## Ateles_geoffroyi 0.17732558 0.00000000 0.08493428
## Callimico_goeldii 0.09239130 -0.08493428 0.00000000
## Callithrix_geoffroyi -0.02822581 -0.20555139 -0.12061711
## Callithrix_jacchus -0.10060976 -0.27793534 -0.19300106
## Cebuella_pygmaea -0.02743902 -0.20476461 -0.11983033
## Callithrix_geoffroyi Callithrix_jacchus Cebuella_pygmaea
## Alouatta_caraya 0.0282258065 0.10060976 0.0274390244
## Ateles_geoffroyi 0.2055513878 0.27793534 0.2047646058
## Callimico_goeldii 0.1206171108 0.19300106 0.1198303287
## Callithrix_geoffroyi 0.0000000000 0.07238395 -0.0007867821
## Callithrix_jacchus -0.0723839496 0.00000000 -0.0731707317
## Cebuella_pygmaea 0.0007867821 0.07317073 0.0000000000
## Cercopithecus_neglectus Colobus_guereza Eulemur_macaco
## Alouatta_caraya 0.05357143 0.021907216 -0.175000000
## Ateles_geoffroyi 0.23089701 0.199232798 0.002325581
## Callimico_goeldii 0.14596273 0.114298521 -0.082608696
## Callithrix_geoffroyi 0.02534562 -0.006318590 -0.203225806
## Callithrix_jacchus -0.04703833 -0.078702540 -0.275609756
## Cebuella_pygmaea 0.02613240 -0.005531808 -0.202439024
## Galago_moholi Galago_senegalensis Gorilla_gorilla
## Alouatta_caraya 0.031250000 0.109848485 -0.08395522
## Ateles_geoffroyi 0.208575581 0.287174066 0.09337036
## Callimico_goeldii 0.123641304 0.202239789 0.00843608
## Callithrix_geoffroyi 0.003024194 0.081622678 -0.11218103
## Callithrix_jacchus -0.069359756 0.009238729 -0.18456498
## Cebuella_pygmaea 0.003810976 0.082409460 -0.11139425
## Hylobates_lar Lemur_catta Leontopithecus_chrysomelas
## Alouatta_caraya -0.03409091 -0.03073770 0.007352941
## Ateles_geoffroyi 0.14323467 0.14658788 0.184678523
## Callimico_goeldii 0.05830040 0.06165360 0.099744246
## Callithrix_geoffroyi -0.06231672 -0.05896351 -0.020872865
## Callithrix_jacchus -0.13470067 -0.13134746 -0.093256815
## Cebuella_pygmaea -0.06152993 -0.05817673 -0.020086083
## Leontopithecus_rosalia Macaca_fascicularis Macaca_fuscata
## Alouatta_caraya 0.002777778 0.07094595 0.031976744
## Ateles_geoffroyi 0.180103359 0.24827153 0.209302326
## Callimico_goeldii 0.095169082 0.16333725 0.124368049
## Callithrix_geoffroyi -0.025448029 0.04272014 0.003750938
## Callithrix_jacchus -0.097831978 -0.02966381 -0.068633012
## Cebuella_pygmaea -0.024661247 0.04350692 0.004537720
## Macaca_nigra Macaca_silenus Mandrillus_sphinx
## Alouatta_caraya 0.097222222 0.036764706 0.036111111
## Ateles_geoffroyi 0.274547804 0.214090287 0.213436693
## Callimico_goeldii 0.189613527 0.129156010 0.128502415
## Callithrix_geoffroyi 0.068996416 0.008538899 0.007885305
## Callithrix_jacchus -0.003387534 -0.063845050 -0.064498645
## Cebuella_pygmaea 0.069783198 0.009325681 0.008672087
## Mico_argentatus Microcebus_murinus Nycticebus_coucang
## Alouatta_caraya 0.07500000 -0.12061404 0.04166667
## Ateles_geoffroyi 0.25232558 0.05671155 0.21899225
## Callimico_goeldii 0.16739130 -0.02822273 0.13405797
## Callithrix_geoffroyi 0.04677419 -0.14883984 0.01344086
## Callithrix_jacchus -0.02560976 -0.22122379 -0.05894309
## Cebuella_pygmaea 0.04756098 -0.14805306 0.01422764
## Nycticebus_pygmaeus Pan_troglodytes Papio_hamadryas
## Alouatta_caraya -0.14030612 -0.04342105 0.07645631
## Ateles_geoffroyi 0.03701946 0.13390453 0.25378189
## Callimico_goeldii -0.04791482 0.04897025 0.16884762
## Callithrix_geoffroyi -0.16853193 -0.07164686 0.04823050
## Callithrix_jacchus -0.24091588 -0.14403081 -0.02415345
## Cebuella_pygmaea -0.16774515 -0.07086008 0.04901729
## Propithecus_coquereli Saguinus_bicolor Saguinus_imperator
## Alouatta_caraya -0.03500000 0.08500000 0.12500000
## Ateles_geoffroyi 0.14232558 0.26232558 0.30232558
## Callimico_goeldii 0.05739130 0.17739130 0.21739130
## Callithrix_geoffroyi -0.06322581 0.05677419 0.09677419
## Callithrix_jacchus -0.13560976 -0.01560976 0.02439024
## Cebuella_pygmaea -0.06243902 0.05756098 0.09756098
## Saguinus_midas Saguinus_oedipus Saimiri_sciureus
## Alouatta_caraya 0.12500000 -0.04664179 0.04741379
## Ateles_geoffroyi 0.30232558 0.13068379 0.22473937
## Callimico_goeldii 0.21739130 0.04574951 0.13980510
## Callithrix_geoffroyi 0.09677419 -0.07486760 0.01918799
## Callithrix_jacchus 0.02439024 -0.14725155 -0.05319596
## Cebuella_pygmaea 0.09756098 -0.07408082 0.01997477
## Sapajus_apella Theropithecus_gelada Trachypithecus_auratus
## Alouatta_caraya 0.009615385 0.105769231 0.08796296
## Ateles_geoffroyi 0.186940966 0.283094812 0.26528854
## Callimico_goeldii 0.102006689 0.198160535 0.18035427
## Callithrix_geoffroyi -0.018610422 0.077543424 0.05973716
## Callithrix_jacchus -0.090994371 0.005159475 -0.01264679
## Cebuella_pygmaea -0.017823640 0.078330206 0.06052394
## Trachypithecus_cristatus Trachypithecus_francoisi
## Alouatta_caraya 0.08928571 -0.008333333
## Ateles_geoffroyi 0.26661130 0.168992248
## Callimico_goeldii 0.18167702 0.084057971
## Callithrix_geoffroyi 0.06105991 -0.036559140
## Callithrix_jacchus -0.01132404 -0.108943089
## Cebuella_pygmaea 0.06184669 -0.035772358
## Varecia_rubra Varecia_variegata
## Alouatta_caraya -0.06144068 -0.085526316
## Ateles_geoffroyi 0.11588490 0.091799266
## Callimico_goeldii 0.03095063 0.006864989
## Callithrix_geoffroyi -0.08966648 -0.113752122
## Callithrix_jacchus -0.16205043 -0.186136072
## Cebuella_pygmaea -0.08887970 -0.112965340
This section creates heatmaps to visualize pairwise differences in prevalence. Species labels are colored based on prevalence quantiles.
# Simplified function to create colored labels based on trait quantiles
create_colored_labels <- function(trait_data) {
# Calculate quantile thresholds
q75 <- quantile(trait_data$trait, 0.75, na.rm = TRUE)
q25 <- quantile(trait_data$trait, 0.25, na.rm = TRUE)
# Create a lookup table with species and their colors
label_df <- trait_data %>%
mutate(
color = case_when(
trait >= q75 ~ "#D73027", # Red for top quartile
trait <= q25 ~ "#1A9850", # Green for bottom quartile
TRUE ~ "#000000" # Black for middle
)
) %>%
select(species, color)
# Create named vector of colored HTML labels
colored_labels <- setNames(
paste0("<span style='color:", label_df$color, ";'>", label_df$species, "</span>"),
label_df$species
)
return(colored_labels)
}
# Helper function to create legend plot
create_legend_plot <- function() {
legend_df <- tibble(
category = c("Top", "Bottom"),
color = c("#D73027", "#1A9850")
)
ggplot(legend_df, aes(x = 1, y = category, color = category)) +
geom_point(size = 0) +
scale_color_manual(
values = setNames(legend_df$color, legend_df$category),
guide = guide_legend(override.aes = list(size = 5))
) +
labs(color = "Trait Categories") +
theme_void() +
theme(
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
)
}
# Helper function to create heatmap
create_heatmap <- function(pairwise_data, colored_labels,
highlight_nonoverlap = TRUE) {
# Ensure label mapping is stable across axes
if (is.null(names(colored_labels))) {
stop("colored_labels must be a named vector with species names.")
}
pairwise_data <- pairwise_data %>%
mutate(
species1 = factor(species1, levels = names(colored_labels)),
species2 = factor(species2, levels = names(colored_labels))
)
p <- ggplot(pairwise_data, aes(x = species2, y = species1, fill = abs(diff_value))) +
geom_tile(color = "black", size = 0.25) +
geom_text(aes(label = sprintf("%.2f", abs(diff_value))), size = 3) +
scale_fill_gradient(low = "white", high = "salmon3", name = "Difference") +
scale_x_discrete(labels = function(x) colored_labels[x]) +
scale_y_discrete(labels = function(y) colored_labels[y]) +
theme_minimal() +
theme(
axis.text.x = element_markdown(angle = 45, hjust = 1),
axis.text.y = element_markdown(angle = 0),
panel.grid = element_blank(),
legend.position = "none"
)
# Add bold border for non-overlapping CIs if requested
if (highlight_nonoverlap) {
p <- p +
geom_tile(data = filter(pairwise_data, !overlap_use),
fill = NA, color = "black", size = 1.5) +
labs(
x = "Species", y = "Species",
title = paste("Pairwise Differences in trait: ", capitalized_trait, " with Non-overlapping CIs", sep = ""),
caption = "Bold border: non-overlapping CIs."
)
} else {
p <- p +
labs(
x = "Species", y = "Species",
title = paste("Pairwise Differences in trait: ", capitalized_trait, sep = ""),
caption = "Bold border: non-overlapping CIs."
)
}
return(p)
}
# Create colored labels directly from trait data
my_labels_colored <- create_colored_labels(trait_ci)
# Save colored labels
write.csv(my_labels_colored, file.path(ci_dir, "my_labels_colored.csv"),
quote = FALSE, row.names = TRUE)
# Create legend
legend_plot <- create_legend_plot()
# Create heatmaps with non-overlapping CI highlights
p_overlap <- create_heatmap(pairwise_data, my_labels_colored, highlight_nonoverlap = TRUE) +
legend_plot + plot_layout(ncol = 2, widths = c(1000, 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_flat <- create_heatmap(pairwise_data, my_labels_colored, highlight_nonoverlap = FALSE) +
legend_plot + plot_layout(ncol = 2, widths = c(1000, 1))
# Save heatmaps
ggsave(file.path(ci_dir, "visual_matrix_no_overlap.png"), p_overlap,
width = 15, height = 12, dpi = "retina")
ggsave(file.path(ci_dir, "visual_matrix_flat.png"), p_flat,
width = 15, height = 12, dpi = "retina")
# Display plots
p_overlap
p_flat